# Jupyter: automatically re-import modified modules before each cell execution.
%load_ext autoreload
%autoreload 2
Goal: derive information and insights from data in order to improve our understanding and inform decision-making.
This talk is about showing you Python tools for how to achieve these goals, and some tips for how to use them.
Dataset: Melbourne City Council Pedestrian Counting System dataset
Contains hourly counts of footfalls across sensors located around Melbourne CDB from 2009 to now.
from pathlib import Path
# Path to the hourly pedestrian-counts CSV, relative to the notebook's working directory.
DATA_PATH = Path("data/Pedestrian_Counting_System___2009_to_Present__counts_per_hour_.csv")
import pandas as pd
# Quick first look at the raw data before any cleaning.
data_df = pd.read_csv(DATA_PATH)
data_df.head()
def load_and_clean_pedestrian_data(path):
    """Load the pedestrian-count CSV and add a combined ``datetime`` column.

    Parameters
    ----------
    path : str, Path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    DataFrame with the original columns plus ``datetime``, assembled from the
    ``Year``, ``Month`` (full month name), ``Mdate`` (day of month) and
    ``Time`` (hour of day) columns.
    """
    df = pd.read_csv(path)
    # "Month" holds full month names (e.g. "January"); parse them to month
    # numbers so pd.to_datetime can assemble timestamps from component columns.
    df["datetime"] = pd.to_datetime(
        {
            "day": df["Mdate"],
            "year": df["Year"],
            "hour": df["Time"],
            "month": pd.to_datetime(df["Month"], format='%B').dt.month,
        }
    )
    return df
# Reload the data with the derived "datetime" column added.
data_df = load_and_clean_pedestrian_data(DATA_PATH)
data_df.head()
Q: How many years does the dataset cover?
# Distinct years present in the data, in ascending order.
YEARS = sorted(data_df["Year"].unique())
YEARS
Q: How many sensors are in the dataset?
SENSORS = data_df["Sensor_Name"].unique()
SENSORS.shape
Q: How many people are recorded each year?
year_counts = data_df.groupby("Year")["Hourly_Counts"].sum()
year_counts
Just eyeballing the data isn't going to cut it. Let's visualise with Pandas' plotting API.
This uses the Matplotlib library, so first set it up and configure. The first line below tells Jupyter to automatically render Matplotlib plots in the cell's output.
%matplotlib inline
import matplotlib.pyplot as plt
# Pandas' .plot() delegates to Matplotlib under the hood.
year_counts.plot(title="Total number of footfalls by year")
Tip: always title your plots.
Problem is that the number of sensors increases.
num_sensors = data_df.groupby("Year")["Sensor_Name"].nunique()
num_sensors.plot(title="Total number of sensors by year");
Let's normalise by number of sensors.
year_counts_df = data_df.groupby("Year").agg({"Hourly_Counts":sum, "Sensor_Name": "nunique"})
year_counts_df["count_per_sensor"] = year_counts_df["Hourly_Counts"] / year_counts_df["Sensor_Name"]
year_counts_df["count_per_sensor"].plot(title="Yearly footfalls/sensers");
Not clear how helpful this is... maybe newer sensors are more likely to be in less travelled areas and our footfalls shouldn't be spread across them to the same weight as more trafficked sensors.
# Build up a DataFrame of sensor names with their average yearly counts and first year of existence:
# get average yearly counts: mean over years of each sensor's yearly total
res_df = pd.DataFrame(
    data_df.groupby("Sensor_Name").apply(
        lambda df: df.groupby("Year")["Hourly_Counts"].sum().mean()
    ),
    columns=["average_yearly_counts"],
)
# get first year of existence for each sensor.
# One groupby/min pass is O(n) overall, unlike the previous approach of
# filtering the whole frame once per sensor, and the result's Sensor_Name
# index aligns with res_df.index automatically on assignment.
res_df["first_year"] = data_df.groupby("Sensor_Name")["Year"].min()
# can also use df.plot.scatter()
res_df.plot(kind="scatter", x="first_year", y="average_yearly_counts",
            title="Average yearly sensor counts compared with sensor first year");
# List the Matplotlib style sheets available in this install.
print(plt.style.available)
# NOTE(review): recent Matplotlib versions renamed this style to
# 'seaborn-v0_8' and removed the bare 'seaborn' alias — confirm the
# installed Matplotlib version before running.
plt.style.use('seaborn')
plot() produces Matplotlib objects. We can break out into the full Matplotlib API for finer control.
from matplotlib import ticker
# Tip: wrap up code to make a plot into a function. Useful for:
# - parameterising your plot
# - not polluting the global namespace/clobbering other identifiers
def make_top_sensor_plot(df, num_sensors=8):
    """Plot the busiest sensors: yearly footfalls (left) and total footfalls (right).

    Parameters
    ----------
    df : DataFrame with ``Sensor_Name``, ``Year`` and ``Hourly_Counts`` columns.
    num_sensors : how many of the most-trafficked sensors to show (default 8).
    """
    # make and configure our split figure
    plt.rcParams.update(
        {"figure.titlesize": 22, "axes.titlesize": 18, "axes.labelsize": 18,
         "legend.fontsize": 12, "xtick.labelsize": 13, "ytick.labelsize": 13}
    )
    fig, (ax1, ax2) = plt.subplots(1, 2)
    plt.subplots_adjust(wspace=.4)
    fig.suptitle(f"Top {num_sensors} most trafficked sensors")
    ax1.set_title("Yearly footfalls")
    ax2.set_title("Total footfalls")
    # make numeric axes comma separated integers
    ax1.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax2.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    # filter the data to the num_sensors busiest sensors.
    # BUG FIX: this was hard-coded to 8, silently ignoring the num_sensors
    # parameter (the figure title already used it).
    sensor_counts = df.groupby("Sensor_Name")["Hourly_Counts"].sum().nlargest(num_sensors)
    top_df = df[df["Sensor_Name"].isin(set(sensor_counts.index))]
    # plot their total counts by year in axis 1
    top_df.groupby("Sensor_Name").apply(
        lambda sensor_df: sensor_df.groupby("Year")["Hourly_Counts"].sum()
    ).unstack().transpose().plot(ax=ax1, figsize=(30, 10))
    # plot their aggregate counts in axis 2
    sensor_counts.sort_values(ascending=True).plot.barh(ax=ax2);
make_top_sensor_plot(data_df)
Seaborn is a library built on top of Matplotlib.
Provides:
Like Matplotlib, only produces static images
Links
# Ridgeplot
# https://seaborn.pydata.org/examples/kde_ridgeplot.html
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Transparent axes so the overlapping ridges show through each other.
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
# Create the data: 500 normal samples across 10 groups, each group's values
# offset by ord(letter) so the ridges are separated horizontally.
rs = np.random.RandomState(1979)
x = rs.randn(500)
g = np.tile(list("ABCDEFGHIJ"), 50)
rand_df = pd.DataFrame(dict(x=x, g=g))
m = rand_df.g.map(ord)
rand_df["x"] += m
# Initialize the FacetGrid object: one short, wide row per group
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(rand_df, row="g", hue="g", aspect=15, height=.5, palette=pal)
# Draw the densities in a few steps: filled KDE, white outline, baseline.
# NOTE(review): `shade` and `bw` are deprecated kdeplot parameters in newer
# seaborn (`fill`/`bw_adjust`) — confirm the installed seaborn version.
g.map(sns.kdeplot, "x", clip_on=False, shade=True, alpha=1, lw=1.5, bw=.2)
g.map(sns.kdeplot, "x", clip_on=False, color="w", lw=2, bw=.2)
g.map(plt.axhline, y=0, lw=2, clip_on=False)

# Define and use a simple function to label the plot in axes coordinates
# (FIX: the body of this def had lost its indentation in the export)
def label(x, color, label):
    """Write the facet's group label inside its own axes."""
    ax = plt.gca()
    ax.text(0, .2, label, fontweight="bold", color=color,
            ha="left", va="center", transform=ax.transAxes)

g.map(label, "x")
# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-.25)
# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True);
# clustermap
# https://seaborn.pydata.org/examples/structured_heatmap.html
import pandas as pd
import seaborn as sns
sns.set()
# Load the brain networks example dataset (downloaded from the seaborn-data repo);
# columns carry a three-level header including a "network" level.
brain_df = sns.load_dataset("brain_networks", header=[0, 1, 2], index_col=0)
# Select a subset of the networks
used_networks = [1, 5, 6, 7, 8, 12, 13, 17]
# Boolean mask over columns: True where the "network" level is one we keep.
used_columns = (brain_df.columns.get_level_values("network")
.astype(int)
.isin(used_networks))
brain_df = brain_df.loc[:, used_columns]
# Create a categorical palette to identify the networks
network_pal = sns.husl_palette(8, s=.45)
# Map network id (as a string, matching the raw column level values) -> colour.
network_lut = dict(zip(map(str, used_networks), network_pal))
# Convert the palette to vectors that will be drawn on the side of the matrix
networks = brain_df.columns.get_level_values("network")
network_colors = pd.Series(networks, index=brain_df.columns).map(network_lut)
# Draw the full plot: hierarchically-clustered correlation heatmap with
# per-network colour strips along both axes.
sns.clustermap(brain_df.corr(), center=0, cmap="vlag",
row_colors=network_colors, col_colors=network_colors,
linewidths=.75, figsize=(13, 13));
Bokeh
Holoviews
Recommendation: Use Holoviews for analysis, falling back to Bokeh if needed
Links
# Density Grid
# https://holoviews.org/gallery/demos/bokeh/iris_density_grid.html#demos-bokeh-gallery-iris-density-grid
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.sampledata.iris import flowers
from holoviews.operation import gridmatrix
# Wrap the iris DataFrame as a HoloViews Dataset.
iris_ds = hv.Dataset(flowers)
# Pairwise grid: 2-D densities (Bivariate) off-diagonal, 1-D Distributions on the diagonal.
density_grid = gridmatrix(iris_ds, diagonal_type=hv.Distribution, chart_type=hv.Bivariate)
# The same grid as raw scatter points, overlaid (*) on the densities below.
point_grid = gridmatrix(iris_ds, chart_type=hv.Points)
(density_grid * point_grid).opts(
opts.Bivariate(bandwidth=0.5, cmap='Blues'),
opts.Points(size=2, tools=['box_select']))
# Choropleth (comment typo fixed: was "Chloropleth")
# https://holoviews.org/gallery/demos/bokeh/texas_choropleth_example.html#demos-bokeh-gallery-texas-choropleth-example
# uncomment on first run to download the bokeh sample data
#import bokeh
#bokeh.sampledata.download()
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.sampledata.us_counties import data as counties
from bokeh.sampledata.unemployment import data as unemployment
# Keep only Texas counties, attaching each county's unemployment rate.
counties = [dict(county, Unemployment=unemployment[cid])
for cid, county in counties.items()
if county["state"] == "tx"]
# Polygons keyed on lon/lat boundaries; the value dimensions drive hover text and colour.
choropleth = hv.Polygons(counties, ['lons', 'lats'], [('detailed name', 'County'), 'Unemployment'])
choropleth.opts(
opts.Polygons(logz=True, tools=['hover'], xaxis=None, yaxis=None,
show_grid=False, show_frame=False, width=500, height=500,
color_index='Unemployment', colorbar=True, toolbar='above', line_color='white'))
# Topographic hillshading
# https://holoviews.org/gallery/demos/bokeh/topographic_hillshading.html#demos-bokeh-gallery-topographic-hillshading
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cbook import get_sample_data
from matplotlib.colors import LightSource

# Load a sample digital elevation model shipped with Matplotlib.
dem = np.load(get_sample_data('jacksboro_fault_dem.npz'))
z = dem['elevation']
dx, dy = dem['dx'], dem['dy']
# Convert grid spacing from degrees to metres (~111.2 km per degree;
# longitude spacing shrinks with the cosine of the latitude).
dy = 111200 * dy
dx = 111200 * dx * np.cos(np.radians(dem['ymin']))
# Shade from the northwest, with the sun 45 degrees from horizontal
ls = LightSource(azdeg=315, altdeg=45)
cmap = plt.cm.gist_earth
# Vary vertical exaggeration and blend mode and plot all combinations.
# (FIX: the loop bodies below had lost their indentation in the export.)
grid = hv.GridMatrix(kdims=['Vertical exaggeration', 'Blend mode'])
for ve in [0.1, 1, 10]:
    # Show the hillshade intensity image in the first row
    grid['None', ve] = hv.Image(ls.hillshade(z, vert_exag=ve, dx=dx, dy=dy))
    # Place hillshaded plots with different blend modes in the rest of the rows
    for mode in ['hsv', 'overlay', 'soft']:
        rgb = ls.shade(z, cmap=cmap, blend_mode=mode,
                       vert_exag=ve, dx=dx, dy=dy)
        grid[mode, ve] = hv.RGB(rgb)
grid.opts(
    opts.GridMatrix(xaxis='bottom', yaxis='left', shared_xaxis=False, shared_yaxis=False),
    opts.Image(cmap='gray'))